In [60]:
%matplotlib inline
from preamble import *
In [61]:
import pandas as pd
# The file has no headers naming the columns, so we pass header=None
# and provide the column names explicitly in "names"
data = pd.read_csv("data/adult.data", header=None, index_col=False,
                   names=['age', 'workclass', 'fnlwgt', 'education', 'education-num',
                          'marital-status', 'occupation', 'relationship', 'race',
                          'gender', 'capital-gain', 'capital-loss', 'hours-per-week',
                          'native-country', 'income'])
# For illustration purposes, we only select some of the columns:
data = data[['age', 'workclass', 'education', 'gender', 'hours-per-week',
             'occupation', 'income']]
# print the first 5 rows
data.head()
Out[61]:
In [62]:
data.gender.value_counts()
Out[62]:
In [63]:
print("Original features:\n", list(data.columns), "\n")
data_dummies = pd.get_dummies(data)
print("Features after get_dummies:\n", list(data_dummies.columns))
In [64]:
data_dummies.head()
Out[64]:
In [65]:
# Get only the columns containing features, that is, all columns from
# 'age' to 'occupation_ Transport-moving'. This range contains all the
# features but not the target.
# .ix was removed from pandas; .loc does the same label-based slicing
features = data_dummies.loc[:, 'age':'occupation_ Transport-moving']
# extract numpy arrays
X = features.values
y = data_dummies['income_ >50K'].values
print(X.shape, y.shape)
In [66]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
print(logreg.score(X_test, y_test))
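A sketch of an alternative, assuming scikit-learn >= 0.20: ColumnTransformer lets you keep the data as a DataFrame and apply different preprocessing to continuous and categorical columns, instead of dummy-encoding everything up front:

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
ct = ColumnTransformer([
    ('scaled', StandardScaler(), ['age', 'hours-per-week']),
    ('onehot', OneHotEncoder(), ['workclass', 'education', 'gender',
                                 'occupation'])])
X_ct = ct.fit_transform(data.drop('income', axis=1))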
In [67]:
# create a dataframe with an integer feature and a categorical string feature
demo_df = pd.DataFrame({'Integer Feature': [0, 1, 2, 1],
                        'Categorical Feature': ['socks', 'fox', 'socks', 'box']})
demo_df
Out[67]:
In [68]:
pd.get_dummies(demo_df)
Out[68]:
In [69]:
demo_df['Integer Feature'] = demo_df['Integer Feature'].astype(str)
pd.get_dummies(demo_df)
Out[69]:
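Equivalently, get_dummies takes a columns argument naming exactly which columns to encode, regardless of their dtype, so the astype conversion above is not strictly necessary:

pd.get_dummies(demo_df, columns=['Integer Feature', 'Categorical Feature'])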
In [72]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
X, y = mglearn.datasets.make_wave(n_samples=100)
plt.plot(X[:, 0], y, 'o')
# 1,000 evenly spaced points between -3 and 3, with the endpoint 3.0
# dropped so every value later falls strictly inside the binning range
line = np.linspace(-3, 3, 1000)[:-1].reshape(-1, 1)
reg = LinearRegression().fit(X, y)
plt.plot(line, reg.predict(line), label="linear regression")
reg = DecisionTreeRegressor(min_samples_split=3).fit(X, y)
plt.plot(line, reg.predict(line), label="decision tree")
plt.ylabel("regression output")
plt.xlabel("input feature")
plt.legend(loc="best")
Out[72]:
In [73]:
np.set_printoptions(precision=2)
bins = np.linspace(-3, 3, 11)
bins
Out[73]:
In [74]:
which_bin = np.digitize(X, bins=bins)
print("\nData points:\n", X[:5])
print("\nBin membership for data points:\n", which_bin[:5])
In [14]:
from sklearn.preprocessing import OneHotEncoder
# transform using the OneHotEncoder.
# sparse=False makes OneHotEncoder return a dense NumPy array
# (the parameter was renamed sparse_output in scikit-learn 1.2)
encoder = OneHotEncoder(sparse=False)
# encoder.fit finds the unique values that appear in which_bin
encoder.fit(which_bin)
# transform creates the one-hot encoding
X_binned = encoder.transform(which_bin)
print(X_binned[:5])
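As a side note (a sketch, assuming scikit-learn >= 0.20): KBinsDiscretizer performs the binning and the one-hot encoding in a single transformer:

from sklearn.preprocessing import KBinsDiscretizer
# ten equal-width bins, one-hot encoded as a dense array
kb = KBinsDiscretizer(n_bins=10, strategy='uniform', encode='onehot-dense')
X_binned_kb = kb.fit_transform(X)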
In [75]:
X_binned.shape
Out[75]:
In [76]:
line_binned = encoder.transform(np.digitize(line, bins=bins))
plt.plot(X[:, 0], y, 'o')
reg = LinearRegression().fit(X_binned, y)
plt.plot(line, reg.predict(line_binned), label='linear regression binned')
reg = DecisionTreeRegressor(min_samples_split=3).fit(X_binned, y)
plt.plot(line, reg.predict(line_binned), label='decision tree binned')
for bin in bins:
    plt.plot([bin, bin], [-3, 3], ':', c='k')
plt.legend(loc="best")
plt.suptitle("linear_binning")
Out[76]:
In [77]:
X_combined = np.hstack([X, X_binned])
print(X_combined.shape)
In [78]:
plt.plot(X[:, 0], y, 'o')
reg = LinearRegression().fit(X_combined, y)
line_combined = np.hstack([line, line_binned])
plt.plot(line, reg.predict(line_combined), label='linear regression combined')
for bin in bins:
    plt.plot([bin, bin], [-3, 3], ':', c='k')
plt.legend(loc="best")
Out[78]:
In [79]:
X_product = np.hstack([X_binned, X * X_binned])
print(X_product.shape)
In [80]:
plt.plot(X[:, 0], y, 'o')
reg = LinearRegression().fit(X_product, y)
line_product = np.hstack([line_binned, line * line_binned])
plt.plot(line, reg.predict(line_product), label='linear regression combined')
for bin in bins:
    plt.plot([bin, bin], [-3, 3], ':', c='k')
plt.legend(loc="best")
Out[80]:
In [81]:
from sklearn.preprocessing import PolynomialFeatures
# include polynomials up to x ** 10; the default include_bias=True
# also adds a feature that is constantly 1
poly = PolynomialFeatures(degree=10)
poly.fit(X)
X_poly = poly.transform(X)
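Two parameters worth knowing about here: interaction_only=True keeps only products of distinct features (no powers of a single feature), and include_bias=False drops the constant column. A small sketch:

# interactions between distinct features only, no x ** 2 terms,
# and no constant bias column
poly_inter = PolynomialFeatures(degree=2, interaction_only=True,
                                include_bias=False)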
In [82]:
X_poly.shape
Out[82]:
In [84]:
poly.get_feature_names()  # renamed get_feature_names_out in scikit-learn 1.0
Out[84]:
In [85]:
plt.plot(X[:, 0], y, 'o')
reg = LinearRegression().fit(X_poly, y)
line_poly = poly.transform(line)
plt.plot(line, reg.predict(line_poly), label='polynomial linear regression')
plt.legend(loc="best")
Out[85]:
In [86]:
from sklearn.svm import SVR
plt.plot(X[:, 0], y, 'o')
for gamma in [1, 10]:
    svr = SVR(gamma=gamma).fit(X, y)
    plt.plot(line, svr.predict(line), label='SVR gamma=%d' % gamma)
plt.legend(loc="best")
Out[86]:
In [87]:
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
# note: load_boston was removed in scikit-learn 1.2
boston = load_boston()
X_train, X_test, y_train, y_test = train_test_split(
    boston.data, boston.target, random_state=0)
# rescale data:
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
In [88]:
poly = PolynomialFeatures(degree=2).fit(X_train_scaled)
X_train_poly = poly.transform(X_train_scaled)
X_test_poly = poly.transform(X_test_scaled)
print(X_train.shape)
print(X_train_poly.shape)
In [89]:
print(poly.get_feature_names())
In [90]:
from sklearn.linear_model import Ridge
ridge = Ridge().fit(X_train_scaled, y_train)
print("score without interactions: %f" % ridge.score(X_test_scaled, y_test))
ridge = Ridge().fit(X_train_poly, y_train)
print("score with interactions: %f" % ridge.score(X_test_poly, y_test))
In [91]:
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(n_estimators=100).fit(X_train_scaled, y_train)
print("score without interactions: %f" % rf.score(X_test_scaled, y_test))
rf = RandomForestRegressor(n_estimators=100).fit(X_train_poly, y_train)
print("score with interactions: %f" % rf.score(X_test_poly, y_test))
In [93]:
rf.apply(X_test_poly)
Out[93]:
In [94]:
rf.apply(X_test_poly).shape
Out[94]:
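apply returns, for each sample, the index of the leaf it lands in within each of the 100 trees. One common trick (a sketch, not part of the original analysis) is to one-hot encode these leaf indices and feed them to a linear model:

from sklearn.preprocessing import OneHotEncoder
# handle_unknown='ignore' covers leaves that never occur in the test data
leaf_encoder = OneHotEncoder(handle_unknown='ignore')
leaves_train = leaf_encoder.fit_transform(rf.apply(X_train_poly))
leaves_test = leaf_encoder.transform(rf.apply(X_test_poly))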
In [95]:
rnd = np.random.RandomState(0)
X_org = rnd.normal(size=(1000, 3))
w = rnd.normal(size=3)
# use the seeded generator so the counts are reproducible
X = rnd.poisson(10 * np.exp(X_org))
y = np.dot(X_org, w)
In [96]:
np.bincount(X[:, 0])
Out[96]:
In [97]:
bins = np.bincount(X[:, 0])
plt.bar(range(len(bins)), bins, color='gray')
plt.ylabel("number of appearances")
plt.xlabel("value")
Out[97]:
In [98]:
from sklearn.linear_model import Ridge
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
Ridge().fit(X_train, y_train).score(X_test, y_test)
Out[98]:
In [99]:
X_train_log = np.log(X_train + 1)
X_test_log = np.log(X_test + 1)
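The + 1 guards against log(0), since the Poisson counts include zeros. np.log1p computes the same quantity and is the more idiomatic, numerically safer spelling:

# equivalent to np.log(X_train + 1)
X_train_log = np.log1p(X_train)
X_test_log = np.log1p(X_test)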
In [101]:
# X_train_log is already log-transformed, so plot it directly
plt.hist(X_train_log[:, 0], bins=25, color='gray');
In [102]:
Ridge().fit(X_train_log, y_train).score(X_test_log, y_test)
Out[102]:
In [103]:
from sklearn.datasets import load_breast_cancer
from sklearn.feature_selection import SelectPercentile
from sklearn.model_selection import train_test_split
cancer = load_breast_cancer()
# get deterministic random numbers
rng = np.random.RandomState(42)
noise = rng.normal(size=(len(cancer.data), 50))
# add noise features to the data
# the first 30 features are from the dataset, the next 50 are noise
X_w_noise = np.hstack([cancer.data, noise])
X_train, X_test, y_train, y_test = train_test_split(
    X_w_noise, cancer.target, random_state=0, test_size=.5)
# use f_classif (the default) and SelectPercentile to select 50% of the features:
select = SelectPercentile(percentile=50)
select.fit(X_train, y_train)
# transform training set:
X_train_selected = select.transform(X_train)
print(X_train.shape)
print(X_train_selected.shape)
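SelectPercentile keeps a fraction of the features; its fixed-count counterpart is SelectKBest. A sketch keeping 40 of the 80 features, which here amounts to the same selection:

from sklearn.feature_selection import SelectKBest
select_k = SelectKBest(k=40).fit(X_train, y_train)
print(select_k.transform(X_train).shape)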
In [104]:
from sklearn.feature_selection import f_classif, f_regression, chi2
In [105]:
F, p = f_classif(X_train, y_train)
In [106]:
plt.figure()
plt.plot(p, 'o')
Out[106]:
In [107]:
mask = select.get_support()
print(mask)
# visualize the mask. black is True, white is False
plt.matshow(mask.reshape(1, -1), cmap='gray_r')
Out[107]:
In [108]:
from sklearn.linear_model import LogisticRegression
# transform test data:
X_test_selected = select.transform(X_test)
lr = LogisticRegression()
lr.fit(X_train, y_train)
print("Score with all features: %f" % lr.score(X_test, y_test))
lr.fit(X_train_selected, y_train)
print("Score with only selected features: %f" % lr.score(X_test_selected, y_test))
In [109]:
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier
select = SelectFromModel(RandomForestClassifier(n_estimators=100, random_state=42),
                         threshold="median")
In [110]:
select.fit(X_train, y_train)
X_train_l1 = select.transform(X_train)
print(X_train.shape)
print(X_train_l1.shape)
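With threshold="median", half of the 80 features are kept. The forest fitted inside SelectFromModel is exposed as select.estimator_, so the importances the threshold was applied to can be inspected directly (a sketch):

importances = select.estimator_.feature_importances_
# the median of these importances is the cutoff used above
print("threshold: %f" % np.median(importances))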
In [111]:
mask = select.get_support()
# visualize the mask. black is True, white is False
plt.matshow(mask.reshape(1, -1), cmap='gray_r')
Out[111]:
In [112]:
X_test_l1 = select.transform(X_test)
LogisticRegression().fit(X_train_l1, y_train).score(X_test_l1, y_test)
Out[112]:
In [113]:
from sklearn.feature_selection import RFE
select = RFE(RandomForestClassifier(n_estimators=100, random_state=42),
             n_features_to_select=40)
# alternative: L1-penalized logistic regression as the selector
# (penalty="l1" requires solver="liblinear" in newer scikit-learn)
# select = RFE(LogisticRegression(penalty="l1", solver="liblinear"),
#              n_features_to_select=40)
select.fit(X_train, y_train)
# visualize the selected features:
mask = select.get_support()
plt.matshow(mask.reshape(1, -1), cmap='gray_r')
Out[113]:
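Besides the mask, RFE records when each feature was dropped: ranking_ assigns 1 to every selected feature and higher numbers to features eliminated in earlier rounds (a quick sketch):

# 1 = kept; larger values were eliminated earlier
print(select.ranking_)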
In [114]:
X_train_rfe = select.transform(X_train)
X_test_rfe = select.transform(X_test)
LogisticRegression().fit(X_train_rfe, y_train).score(X_test_rfe, y_test)
Out[114]:
In [115]:
select.score(X_test, y_test)
Out[115]:
In [116]:
from mlxtend.feature_selection import SequentialFeatureSelector
sfs = SequentialFeatureSelector(LogisticRegression(), k_features=40,
                                forward=True, scoring='accuracy', cv=5)
sfs = sfs.fit(X_train, y_train)
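Here mlxtend provides the sequential selection. For reference, scikit-learn 0.24 added its own SequentialFeatureSelector with a similar interface; a sketch assuming that version (like the mlxtend variant, it is much slower than univariate selection):

from sklearn.feature_selection import SequentialFeatureSelector as SkSFS
sfs_skl = SkSFS(LogisticRegression(), n_features_to_select=40,
                direction='forward', cv=5)
sfs_skl.fit(X_train, y_train)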
In [117]:
mask = np.zeros(80, dtype='bool')
mask[np.array(sfs.k_feature_idx_)] = True
In [118]:
plt.matshow(mask.reshape(1, -1), cmap='gray_r')
Out[118]:
In [59]:
LogisticRegression().fit(sfs.transform(X_train), y_train).score(sfs.transform(X_test), y_test)
Out[59]:
In [123]:
data = pd.read_csv("data/adult.data", header=None, index_col=False,
                   names=['age', 'workclass', 'fnlwgt', 'education', 'education-num',
                          'marital-status', 'occupation', 'relationship', 'race',
                          'gender', 'capital-gain', 'capital-loss', 'hours-per-week',
                          'native-country', 'income'])
y = data.income.values
X = pd.get_dummies(data.drop("income", axis=1))
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
scaler = MinMaxScaler().fit(X_train)
X_train_ = scaler.transform(X_train)
X_test_ = scaler.transform(X_test)
In [124]:
LogisticRegression().fit(X_train_, y_train).score(X_test_, y_test)
Out[124]:
In [125]:
X_train.shape
Out[125]:
In [178]:
select = SelectFromModel(RandomForestClassifier(n_estimators=100),
                         threshold="5 * median")
X_train_selected = select.fit_transform(X_train_, y_train)
X_test_selected = select.transform(X_test_)
In [179]:
LogisticRegression().fit(X_train_selected, y_train).score(X_test_selected, y_test)
Out[179]:
In [180]:
X_train_selected.shape
Out[180]:
In [197]:
poly = PolynomialFeatures(degree=2).fit(X_train_selected)
X_train_selected_poly = poly.transform(X_train_selected)
X_test_selected_poly = poly.transform(X_test_selected)
In [200]:
# penalty="l1" requires solver="liblinear" (or "saga") in newer scikit-learn
lr = LogisticRegression(C=0.01, penalty="l1",
                        solver="liblinear").fit(X_train_selected_poly, y_train)
lr.score(X_test_selected_poly, y_test)
Out[200]:
In [201]:
# names of the selected features whose coefficients survived the l1 penalty
np.array(poly.get_feature_names(X.columns[select.get_support()]))[lr.coef_.ravel() != 0]
Out[201]: